
library(tidyverse)
library(openxlsx)


# The input CSV and the exported workbook are both referenced by bare file
# name, so they resolve against this working directory.
setwd("C:/Users/623968/OneDrive - BOOZ ALLEN HAMILTON/Documents/R")

# Source: https://github.com/cdcepi/zika/tree/master/Colombia/Municipality_Zika/data

# Colombia Antioquia municipality Zika cases, 2016
df <- read.csv("Municipality_Zika_2016-01-09.csv")

# Data exploration: column structure, per-column summaries, first rows
str(df)
summary(df)
head(df)


# Cleaning: convert report_date to a Date column
df <- mutate(df, report_date = as.Date(report_date))

# Re-check the structure after the conversion
str(df)

# Set up summary stats and description columns -----------
# Builds one data-dictionary row per numeric column of `df`: empty
# documentation placeholders, the value type, counts, and summary statistics.
# Every statistic is computed with na.rm = TRUE: quantile() stops with an
# error when it meets a missing value, and min()/max()/mean()/median() would
# return NA, even though missing values are expected (see `N missing`).
df_nums <- df %>%
  select(where(is.numeric)) %>% # Selects all numeric columns
  pivot_longer(
    cols = everything(), # Two columns: one with column names, one with values
    names_to = "Variable Name",
    values_to = "values"
  ) %>%
  group_by(`Variable Name`) %>%
  summarise(
    `Variable Description` = NA, # Column to describe what the variable is
    `Why data element was collected` = NA, # Column for why this variable was collected
    `How data element was collected` = NA, # Column for the collection methods used
    # NOTE(review): all numeric columns are stacked into a single `values`
    # vector above, so with a mix of integer and double columns this presumably
    # reports their common type rather than each column's own type - confirm.
    `Variable Type` = class(values), # Column for data type
    N = n(), # Sample size (rows, including missing values)
    `N missing` = sum(is.na(values)), # Count of missing (NA) values
    min = min(values, na.rm = TRUE), # Summary stats
    Q1 = quantile(values, probs = 0.25, na.rm = TRUE),
    median = median(values, na.rm = TRUE),
    mean = mean(values, na.rm = TRUE),
    Q3 = quantile(values, probs = 0.75, na.rm = TRUE),
    max = max(values, na.rm = TRUE),
    IQR = Q3 - Q1,
    Units = NA,
    # Must stay last: it overwrites `values`, which every statistic above reads.
    values = paste(min, max, sep = " - ") # Range
  )

# Data-dictionary rows for Date columns: one row per column, holding its
# "earliest - latest" range plus empty documentation placeholders.
#
# The range is built inside the grouped summarise() so each date column gets
# its own range. Collapsing min/max after the grouping has been dropped would
# glue the ranges of all date columns into one string.
df_dates <- df %>%
  # Class test written out so it does not depend on lubridate being attached.
  select(where(function(col) inherits(col, "Date"))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable Name",
    values_to = "values"
  ) %>%
  group_by(`Variable Name`) %>%
  summarise(
    `Variable Description` = NA,
    `Why data element was collected` = NA,
    `How data element was collected` = NA,
    # Read the class before `values` is replaced by the range text below.
    `Variable Type` = class(values),
    values = paste(
      min(values, na.rm = TRUE),
      max(values, na.rm = TRUE),
      sep = " - "
    ),
    Units = "Days"
  ) %>%
  # Keep `values` directly after the variable name.
  relocate(values, .after = `Variable Name`)

# Preview the character columns
head(df %>% select(where(is.character)))

# Number of distinct locations. length() has to wrap unique(), not the other
# way round: unique(length(x)) is just the row count. n_distinct() does this
# in one call.
df %>% summarise(n_locations = n_distinct(location))

# Data-dictionary rows for categorical (character) columns: one row per
# column, listing its distinct values separated by "; ".
df_cats <- df %>%
  select(where(is.character)) %>%
  # NOTE(review): positional selection - this depends on the order of the
  # character columns in the CSV; confirm that 3:5 are the intended columns.
  select(3:5) %>% # Removes columns about location as they will be stored separately
  distinct() %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable Name",
    values_to = "values"
  ) %>%
  # De-duplicate per variable. De-duplicating on `values` alone would drop a
  # value from every variable except the first one it appears in.
  distinct(`Variable Name`, values) %>%
  group_by(`Variable Name`) %>%
  # summarise() (not mutate()) so the result is a single row per variable.
  summarise(
    values = paste0(values, collapse = "; "), # Stores all values per group in one row
    `Variable Description` = NA, # Add data documentation columns
    `Why data element was collected` = NA,
    `How data element was collected` = NA,
    `Variable Type` = class(values)
  )

# Stack the numeric, categorical and date summaries into one data dictionary
# and drop rows that are exact duplicates.
df_dict <- bind_rows(df_nums, df_cats, df_dates) %>% # Pulls all objects together
  distinct()

# Inspect the combined dictionary. This has to come after the assignment
# above: viewing it first fails with "object 'df_dict' not found" in a fresh
# session (or shows a stale object left over from an earlier run).
view(df_dict)

# Add in variable descriptions
# Named vector: variable name -> description.
description_lookup <- c(
  report_date = "Date when report was published",
  country = "Country in which case was reported",
  department = "Department in which case was reported",
  city_town = "City, town, or pueblo in which case was reported",
  location_type = "Description of regional location",
  data_field = "Type of case",
  data_field_code = "A specific code for each type of case",
  value = "The number of cases per data field type",
  unit = "The unit in which cases are reported"
)

df_dict <- df_dict %>%
  mutate(
    # Use the description from the lookup where the variable name is listed;
    # otherwise keep the existing description, converted to character.
    `Variable Description` = coalesce(
      unname(description_lookup[`Variable Name`]),
      as.character(`Variable Description`)
    ),
    # For the "value" variable, fill a missing unit with "cases".
    Units = ifelse(
      `Variable Name` == "value",
      replace_na(Units, "cases"),
      Units
    )
  )

# Places tab
# Splits each "-"-delimited location string into its three parts (keeping the
# original column) and numbers the rows 5000, 5001, ... in `altname1`.
# NOTE(review): there is no de-duplication here, so if `df` holds several rows
# per location each repeat gets its own row and its own altname1 - confirm
# that this is intended.
df_places <- df %>%
  select(location, location_type) %>%
  separate_wider_delim(
    location,
    delim = "-",
    names = c("country", "state_province", "district_county_municipality"),
    cols_remove = FALSE
  ) %>%
  mutate(altname1 = row_number() + 4999) %>%
  relocate(location, location_type)

# Codes tab
# One row per distinct data_field / data_field_code combination.
df_codes <- df %>%
  select(data_field, data_field_code) %>%
  distinct()

# Export
# Named list of the three tables, written out as a single workbook.
sheet_names <- list(
  "Data Dictionary" = df_dict,
  "Places (Colombia)" = df_places,
  "Code Guide" = df_codes
)

write.xlsx(sheet_names, file = "Zika Data Dictionary DEMO.xlsx")





